# Global knitr option: echo all chunk source code in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

Required Packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(jsonlite)
library(httr)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind

Reading the Data from the API and storing it to CSV file. (Done only once)

# Download the full NYC EMS incident dataset (Socrata CSV export, ~2.3M rows)
# and cache it locally. base::download.file() is portable and raises an R error
# on failure, unlike shelling out to curl via system(), which fails silently
# and requires curl to be installed on the host.
download.file(
  url      = "https://data.cityofnewyork.us/resource/76xm-jjuj.csv?$limit=2300000",
  destfile = "nyc_ems.csv",
  mode     = "wb"  # binary mode so the file is byte-exact on Windows too
)

Reading the data from csv created above.

# Load the cached CSV into a data.frame and preview the first rows.
# NOTE(review): zipcode is read as an integer here; NYC zips all start with
# 1xxxx so no leading zeros are lost, but colClasses = "character" would be
# safer if the dataset ever covers other regions — TODO confirm.
data <- read.csv("nyc_ems.csv")
head(data)
##   cad_incident_id       incident_datetime initial_call_type
## 1       220904738 2022-03-31T23:59:52.000            STNDBY
## 2       220904735 2022-03-31T23:59:45.000              DRUG
## 3       220904734 2022-03-31T23:59:02.000              DRUG
## 4       220904733 2022-03-31T23:58:37.000            TRAUMA
## 5       220904732 2022-03-31T23:58:19.000              SICK
## 6       220904731 2022-03-31T23:58:10.000            INJURY
##   initial_severity_level_code final_call_type final_severity_level_code
## 1                           8          STNDBY                         8
## 2                           4            DRUG                         4
## 3                           4            DRUG                         4
## 4                           2          TRAUMA                         2
## 5                           6            SICK                         6
## 6                           5          INJURY                         5
##   first_assignment_datetime valid_dispatch_rspns_time_indc
## 1                                                        N
## 2   2022-03-31T23:59:45.000                              Y
## 3   2022-03-31T23:59:13.000                              Y
## 4   2022-03-31T23:59:10.000                              Y
## 5   2022-03-31T23:59:20.000                              Y
## 6   2022-03-31T23:58:44.000                              Y
##   dispatch_response_seconds_qy first_activation_datetime
## 1                            0                          
## 2                            0   2022-03-31T23:59:45.000
## 3                           11   2022-03-31T23:59:19.000
## 4                           33   2022-03-31T23:59:47.000
## 5                           61   2022-03-31T23:59:34.000
## 6                           34   2022-03-31T23:58:53.000
##   first_on_scene_datetime valid_incident_rspns_time_indc
## 1                                                      N
## 2 2022-03-31T23:59:45.000                              Y
## 3 2022-04-01T00:02:02.000                              Y
## 4 2022-04-01T00:01:27.000                              Y
## 5 2022-04-01T00:17:19.000                              Y
## 6 2022-04-01T00:08:01.000                              Y
##   incident_response_seconds_qy incident_travel_tm_seconds_qy
## 1                           NA                            NA
## 2                            0                             0
## 3                          180                           169
## 4                          170                           137
## 5                         1140                          1079
## 6                          591                           557
##    first_to_hosp_datetime first_hosp_arrival_datetime incident_close_datetime
## 1                                                     2022-03-31T23:59:52.000
## 2 2022-04-01T00:01:50.000     2022-04-01T00:06:15.000 2022-04-01T00:45:52.000
## 3 2022-04-01T00:15:33.000     2022-04-01T00:20:22.000 2022-04-01T00:55:56.000
## 4                                                     2022-04-01T00:12:02.000
## 5 2022-04-01T00:23:06.000     2022-04-01T00:37:57.000 2022-04-01T01:05:19.000
## 6 2022-04-01T00:22:34.000     2022-04-01T00:34:26.000 2022-04-01T00:51:18.000
##   held_indicator incident_disposition_code   borough incident_dispatch_area
## 1              N                        NA MANHATTAN                     M7
## 2              N                        82     BRONX                     B2
## 3              N                        82 MANHATTAN                     M3
## 4              N                        90     BRONX                     B4
## 5              N                        82  BROOKLYN                     K4
## 6              N                        82 MANHATTAN                     M2
##   zipcode policeprecinct citycouncildistrict communitydistrict
## 1   10035             25                   8               111
## 2   10458             48                  15               206
## 3   10019             18                   3               104
## 4   10470             47                  11               212
## 5   11212             73                  41               316
## 6   10014              6                   3               102
##   communityschooldistrict congressionaldistrict reopen_indicator
## 1                       4                    12                N
## 2                      10                    15                N
## 3                       2                    10                N
## 4                      11                    16                N
## 5                      23                     9                N
## 6                       2                    10                N
##   special_event_indicator standby_indicator transfer_indicator
## 1                       N                 Y                  N
## 2                       N                 N                  N
## 3                       N                 N                  N
## 4                       N                 N                  N
## 5                       N                 N                  N
## 6                       N                 N                  N

# Checking the dimensions of the raw data (rows x columns).

dim(data)
## [1] 2300000      31

Cleaning process

# List all 31 raw column names so we can decide which ones to keep below.
colnames(data)
##  [1] "cad_incident_id"                "incident_datetime"             
##  [3] "initial_call_type"              "initial_severity_level_code"   
##  [5] "final_call_type"                "final_severity_level_code"     
##  [7] "first_assignment_datetime"      "valid_dispatch_rspns_time_indc"
##  [9] "dispatch_response_seconds_qy"   "first_activation_datetime"     
## [11] "first_on_scene_datetime"        "valid_incident_rspns_time_indc"
## [13] "incident_response_seconds_qy"   "incident_travel_tm_seconds_qy" 
## [15] "first_to_hosp_datetime"         "first_hosp_arrival_datetime"   
## [17] "incident_close_datetime"        "held_indicator"                
## [19] "incident_disposition_code"      "borough"                       
## [21] "incident_dispatch_area"         "zipcode"                       
## [23] "policeprecinct"                 "citycouncildistrict"           
## [25] "communitydistrict"              "communityschooldistrict"       
## [27] "congressionaldistrict"          "reopen_indicator"              
## [29] "special_event_indicator"        "standby_indicator"             
## [31] "transfer_indicator"

#Removing the unnecessary columns not required for our use case.

# Columns not needed for the analyses below: validity indicators, intermediate
# timestamps, and geographic identifiers other than zipcode.
cols_to_drop <- c(
  "valid_dispatch_rspns_time_indc", "first_activation_datetime",
  "valid_incident_rspns_time_indc", "borough", "incident_dispatch_area",
  "policeprecinct", "citycouncildistrict", "communitydistrict",
  "communityschooldistrict", "congressionaldistrict",
  "first_on_scene_datetime", "first_to_hosp_datetime",
  "first_hosp_arrival_datetime"
)
clean_data <- data %>%
  select(-all_of(cols_to_drop))

#Renaming the column names accordingly for ease.

# Rename columns to shorter, more descriptive names used throughout the rest
# of the analysis (e.g. *_seconds_qy counters become time_* columns).
clean_data <- clean_data %>%
  rename(incident_id = cad_incident_id,initial_call_reason = initial_call_type,final_call_reason=final_call_type,time_elapsed_assignment=dispatch_response_seconds_qy,
         time_incident_response=incident_response_seconds_qy,time_incident_travel=incident_travel_tm_seconds_qy)

# Count missing values per column to decide how to handle NAs.
# NOTE(review): vapply(clean_data, \(x) sum(is.na(x)), integer(1)) would be
# type-stable in scripted use; sapply is fine interactively.

sapply(clean_data, function(x) sum(is.na(x)))
##                 incident_id           incident_datetime 
##                           0                           0 
##         initial_call_reason initial_severity_level_code 
##                           0                           0 
##           final_call_reason   final_severity_level_code 
##                           0                           0 
##   first_assignment_datetime     time_elapsed_assignment 
##                           0                           0 
##      time_incident_response        time_incident_travel 
##                       68763                       68415 
##     incident_close_datetime              held_indicator 
##                           0                           0 
##   incident_disposition_code                     zipcode 
##                       13864                       33737 
##            reopen_indicator     special_event_indicator 
##                           0                           0 
##           standby_indicator          transfer_indicator 
##                           0                           0

# Removing rows having NA values as we would still have a large sample of data
# (~2.2M of 2.3M rows survive — see dim() below).

clean_data<- na.omit(clean_data)

# Checking the dimensions of the cleaned data.

dim(clean_data)
## [1] 2198972      18

# Converting timestamp columns to date-time (POSIXct) format, as they were
# read in as strings.
# NOTE(review): the trailing comma in mutate() below is tolerated by recent
# dplyr versions but should be removed.

clean_data <- clean_data %>%
  mutate(incident_datetime = ymd_hms(incident_datetime),
         incident_close_datetime = ymd_hms(incident_close_datetime),
         first_assignment_datetime=ymd_hms(first_assignment_datetime),)
# Sanity check: count rows where the first assignment happened BEFORE the
# incident was created (should be 0).
sum(clean_data$first_assignment_datetime < clean_data$incident_datetime, na.rm = TRUE)
## [1] 0
# Count rows where the incident was closed before the first unit was assigned
# (impossible ordering; these 52 rows are dropped below).
sum(clean_data$incident_close_datetime < clean_data$first_assignment_datetime, na.rm = TRUE)
## [1] 52
# Keep only rows with close strictly after assignment. NOTE(review): this also
# drops rows where the two timestamps are exactly equal, slightly more than
# the 52 counted above — TODO confirm that is intended.
clean_data <- clean_data %>%
  filter(incident_close_datetime > first_assignment_datetime)
# Count the number of duplicate incident IDs
sum(duplicated(clean_data$incident_id))
## [1] 14
# Remove the duplicate incidents, keeping the first occurrence of each ID.
clean_data <- distinct(clean_data, incident_id, .keep_all = TRUE)
# Severity codes arrive as strings; cast to integers for range checks.
clean_data$initial_severity_level_code <- as.integer(clean_data$initial_severity_level_code)
clean_data$final_severity_level_code <- as.integer(clean_data$final_severity_level_code)
# Check for out-of-range values in initial_severity_level_code (valid: 1-8)
sum(!between(clean_data$initial_severity_level_code, 1, 8))
## [1] 7
# Check for out-of-range values in final_severity_level_code (valid: 1-8)
sum(!between(clean_data$final_severity_level_code, 1, 8))
## [1] 0
# Inspect the rows where initial_severity_level_code is out of range
# (all turn out to be code 9 — see output below).
clean_data %>% 
  filter(initial_severity_level_code < 1 | initial_severity_level_code > 8) %>%
  select(initial_severity_level_code)
##   initial_severity_level_code
## 1                           9
## 2                           9
## 3                           9
## 4                           9
## 5                           9
## 6                           9
## 7                           9
# Drop the 7 rows with out-of-range (code 9) initial severity.
clean_data <- clean_data %>% 
  filter(initial_severity_level_code >= 1 & initial_severity_level_code <= 8)
# Verify every Y/N indicator column contains only "N" or "Y"
# (an empty table means no unexpected values).
table(clean_data$held_indicator[!clean_data$held_indicator %in% c("N", "Y")])
## < table of extent 0 >
table(clean_data$reopen_indicator[!clean_data$reopen_indicator %in% c("N", "Y")])
## < table of extent 0 >
table(clean_data$special_event_indicator[!clean_data$special_event_indicator %in% c("N", "Y")])
## < table of extent 0 >
table(clean_data$standby_indicator[!clean_data$standby_indicator %in% c("N", "Y")])
## < table of extent 0 >
table(clean_data$transfer_indicator[!clean_data$transfer_indicator %in% c("N", "Y")])
## < table of extent 0 >
# Define the regex pattern for a valid US zip code (5 digits, optional +4).
zip_pattern <- "^[0-9]{5}(?:-[0-9]{4})?$"

# Count zipcodes that fail the pattern.
length(which(!str_detect(clean_data$zipcode, zip_pattern)))
## [1] 3
# Inspect the offending zipcodes. NOTE(review): this inline regex is
# equivalent to the zip_pattern defined above; reusing zip_pattern would
# avoid the duplication.
clean_data %>%
  filter(!str_detect(zipcode, "^\\d{5}(-\\d{4})?$")) %>%
  select(zipcode)
##   zipcode
## 1      83
## 2      83
## 3      83
# Keep only rows with well-formed zipcodes (drops the 3 rows shown above).
clean_data <- clean_data %>% 
             filter(grepl("^\\d{5}(-\\d{4})?$", zipcode))
# Cast the elapsed-time columns to integers (whole seconds).
clean_data$time_elapsed_assignment <- as.integer(clean_data$time_elapsed_assignment)
clean_data$time_incident_response<- as.integer(clean_data$time_incident_response)
clean_data$time_incident_travel <- as.integer(clean_data$time_incident_travel)
# Keep only rows whose disposition code is one of the documented values
# (integer codes compare against strings via implicit coercion in %in%).
clean_data <- subset(clean_data, incident_disposition_code %in% c("82", "83","87","90","91","92","93","94","95","96","CANCEL","DUP","NOTSNT","ZZZZZZ"))
# Lookup table mapping disposition codes to human-readable meanings.
code_meaning_map <- c("82" = "transporting patient", "83" = "patient pronounced dead", "87" = "cancelled", "90" = "unfounded", "91" = "condition corrected", "92" = "treated not transported", "93" = "refused medical aid", "94" = "treated and transported", "95" = "triaged at scene no transport", "96" = "patient gone on arrival", "CANCEL" = "cancelled", "DUP" = "duplicate incident", "NOTSNT" = "unit not sent", "ZZZZZZ" = "no disposition")
# NOTE(review): this named-vector lookup produces a *named* character column
# (visible in str() below); wrapping in unname() would give a plain column.
clean_data <- mutate(clean_data, incident_disposition_code_meaning = code_meaning_map[as.character(incident_disposition_code)])
str(clean_data)
## 'data.frame':    2198698 obs. of  19 variables:
##  $ incident_id                      : int  220904735 220904734 220904733 220904732 220904731 220904730 220904729 220904728 220904727 220904726 ...
##  $ incident_datetime                : POSIXct, format: "2022-03-31 23:59:45" "2022-03-31 23:59:02" ...
##  $ initial_call_reason              : chr  "DRUG" "DRUG" "TRAUMA" "SICK" ...
##  $ initial_severity_level_code      : int  4 4 2 6 5 2 3 3 2 7 ...
##  $ final_call_reason                : chr  "DRUG" "DRUG" "TRAUMA" "SICK" ...
##  $ final_severity_level_code        : int  4 4 2 6 5 2 3 3 2 7 ...
##  $ first_assignment_datetime        : POSIXct, format: "2022-03-31 23:59:45" "2022-03-31 23:59:13" ...
##  $ time_elapsed_assignment          : int  0 11 33 61 34 24 10 26 22 11 ...
##  $ time_incident_response           : int  0 180 170 1140 591 409 359 433 455 203 ...
##  $ time_incident_travel             : int  0 169 137 1079 557 385 349 407 433 192 ...
##  $ incident_close_datetime          : POSIXct, format: "2022-04-01 00:45:52" "2022-04-01 00:55:56" ...
##  $ held_indicator                   : chr  "N" "N" "N" "N" ...
##  $ incident_disposition_code        : int  82 82 90 82 82 82 82 82 90 82 ...
##  $ zipcode                          : int  10458 10019 10470 11212 10014 11373 10003 10027 11375 10016 ...
##  $ reopen_indicator                 : chr  "N" "N" "N" "N" ...
##  $ special_event_indicator          : chr  "N" "N" "N" "N" ...
##  $ standby_indicator                : chr  "N" "N" "N" "N" ...
##  $ transfer_indicator               : chr  "N" "N" "N" "N" ...
##  $ incident_disposition_code_meaning: Named chr  "transporting patient" "transporting patient" "unfounded" "transporting patient" ...
##   ..- attr(*, "names")= chr [1:2198698] "82" "82" "90" "82" ...
# Persist the cleaned dataset. NOTE(review): row.names = TRUE writes an extra
# unnamed index column; row.names = FALSE is usually preferred — TODO confirm
# downstream readers expect the index column.
write.csv(clean_data, file = "clean_data.csv", row.names = TRUE)

Analysis

UseCase -1

#We need to categorize the call times into 4 categories: Morning, Noon, Evening & Night.

# Categorize call times into four buckets by hour of day:
#   Morning [05, 12), Noon [12, 17), Evening [17, 21), Night otherwise.
#
# @param call_time A POSIXct date-time (scalar or vector).
# @return A character vector the same length as call_time.
#
# Vectorized with nested ifelse() instead of scalar if/else so it can be
# applied to a whole column at once; the original scalar version forced a
# per-element sapply() over ~2.2M rows. For a single input the result is
# identical to the original function.
categorize_call_time <- function(call_time) {
  hour <- as.numeric(format(call_time, "%H"))
  ifelse(hour >= 5 & hour < 12, "Morning",
    ifelse(hour >= 12 & hour < 17, "Noon",
      ifelse(hour >= 17 & hour < 21, "Evening", "Night")))
}

# Apply the categorizer to incident_datetime to create call_time_category.
# NOTE(review): sapply() calls the function once per row (~2.2M calls), which
# is slow; a vectorized categorizer could be applied to the column directly.
clean_data$call_time_category <- sapply(clean_data$incident_datetime, categorize_call_time)

# Display incident counts per call-time category, most frequent first.

clean_data %>% 
  group_by(call_time_category) %>% 
  summarize(count = n()) %>% 
  arrange(desc(count))
## # A tibble: 4 × 2
##   call_time_category  count
##   <chr>               <int>
## 1 Night              604854
## 2 Noon               585859
## 3 Morning            554847
## 4 Evening            453138

#Plotting a pie chart showing % of incidents in each call time category.

# Share of incidents falling in each call-time bucket, as percentages.
percentages <- clean_data %>%
  count(call_time_category, name = "count") %>%
  mutate(percent = 100 * count / sum(count))

# Pie chart: a single stacked bar transformed to polar coordinates, with each
# slice labelled by its rounded percentage.
ggplot(percentages, aes(x = "", y = percent, fill = call_time_category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  geom_text(aes(label = paste0(round(percent), "%")),
            position = position_stack(vjust = 0.5)) +
  coord_polar("y", start = 0) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(title = "Call Time Category Distribution", fill = "Call Time Category")

UseCase-2

# Average incident response time (seconds) per call-time category.
response_time_by_time_category <- clean_data %>%
  group_by(call_time_category) %>%
  # Calculate the average response time
  summarize(avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
  # Sort the data by average response time in ascending order
  arrange(avg_response_time)

# View the resulting data
response_time_by_time_category
## # A tibble: 4 × 2
##   call_time_category avg_response_time
##   <chr>                          <dbl>
## 1 Evening                         503.
## 2 Noon                            510.
## 3 Morning                         511.
## 4 Night                           528.
# Boxplot of raw response times by category to visualize spread and outliers.
ggplot(clean_data, aes(x = call_time_category, y = time_incident_response)) +
  geom_boxplot() +
  labs(x = "Call Time Category", y = "Time Incident Response (seconds)",
       title = "Distribution of Time Incident Response by Call Time Category")

We can see that these points are outliers. Although the outliers may represent special cases, we chose to move forward without them because we wanted to find general patterns.

UseCase - 3

#We are going to analyze the average response and assignment times for different time categories of calls. The response time is the time between the initial call and the first unit being assigned to the incident, while the assignment time is the time elapsed from when the incident is created to when the first unit is assigned.

# NOTE(review): this recomputes exactly the same table as Use Case 2 above;
# the existing response_time_by_time_category could simply be reused.
response_time_by_time_category <- clean_data %>%
  group_by(call_time_category) %>%
  # Calculate the average response time
  summarize(avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
  # Sort the data by average response time in ascending order
  arrange(avg_response_time)

# View the resulting data
response_time_by_time_category
## # A tibble: 4 × 2
##   call_time_category avg_response_time
##   <chr>                          <dbl>
## 1 Evening                         503.
## 2 Noon                            510.
## 3 Morning                         511.
## 4 Night                           528.

# Bar plot showing the average response time for different time categories of calls.

ggplot(response_time_by_time_category, aes(x = call_time_category, y = avg_response_time)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Average Response Time by Time Category",
       x = "Time Category",
       y = "Average Response Time (in seconds)")

# Average assignment time (call creation to first unit assigned) per category.
assignment_time_by_time_category <- clean_data %>%
  group_by(call_time_category) %>%
  # Calculate the average assignment time
  summarize(avg_assignment_time = mean(time_elapsed_assignment, na.rm = TRUE)) %>%
  # Sort the data by average assignment time in ascending order
  arrange(avg_assignment_time)

# View the resulting data
assignment_time_by_time_category
## # A tibble: 4 × 2
##   call_time_category avg_assignment_time
##   <chr>                            <dbl>
## 1 Evening                           47.4
## 2 Morning                           47.7
## 3 Noon                              49.8
## 4 Night                             70.6
# Bar plot of average assignment time per category.
ggplot(assignment_time_by_time_category, aes(x = call_time_category, y = avg_assignment_time)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Average Assignment Time by Time Category",
       x = "Time Category",
       y = "Average Assignment Time (in seconds)")

UseCase-4

#In this use case,the code performs various data analysis and visualization tasks related to call reasons in the dataset clean_data. Key findings such as the top 10 initial call reasons and their frequencies, the number of times initial_call_reason and final_call_reason are not the same, and the top final_call_reasons for specific initial_call_reasons. Plots are generated for each finding

# Tally incidents per initial call reason, most frequent first.
freq_initial_call_reason <- clean_data %>%
  count(initial_call_reason, name = "count") %>%
  arrange(desc(count))

# Keep the ten most common reasons (slice_max includes ties at the cutoff).
top_10_initial_call_reasons <- freq_initial_call_reason %>%
  slice_max(count, n = 10)

# Horizontal bar chart, bars ordered by frequency.
ggplot(top_10_initial_call_reasons,
       aes(x = reorder(initial_call_reason, count), y = count)) +
  geom_col(fill = "Blue", color = "Black", width = 0.5) +
  coord_flip() +
  labs(title = "Top 10 Initial Call Reasons",
       x = "Initial Call Reason",
       y = "Count")

# Count incidents whose call reason was reclassified between intake and close.
sum(clean_data$initial_call_reason != clean_data$final_call_reason)
## [1] 292790
# Number of reclassifications per initial call reason, largest first.
clean_data %>%
  group_by(initial_call_reason) %>%
  summarise(num_updates = sum(initial_call_reason != final_call_reason)) %>%
  arrange(desc(num_updates))
## # A tibble: 121 × 2
##    initial_call_reason num_updates
##    <chr>                     <int>
##  1 EDP                       64762
##  2 UNKNOW                    42141
##  3 INJURY                    23737
##  4 SICK                      19523
##  5 UNC                       16076
##  6 CARD                      10138
##  7 DRUG                       8157
##  8 DIFFBR                     7901
##  9 CVAC                       6759
## 10 EDPC                       6715
## # ℹ 111 more rows
# Top 10 final reasons for calls that started as UNKNOW.
# NOTE(review): select(-1) on a grouped tibble triggers the "Adding missing
# grouping variables" message seen below; ungroup() first would silence it.
clean_data %>%
  filter(initial_call_reason == "UNKNOW") %>%
  group_by(final_call_reason) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10) %>%
  select(-1)
## Adding missing grouping variables: `final_call_reason`
## # A tibble: 10 × 2
## # Groups:   final_call_reason [10]
##    final_call_reason      n
##    <chr>              <int>
##  1 UNKNOW            182473
##  2 SICK                5433
##  3 DRUG                4351
##  4 CARD                3632
##  5 UNC                 3630
##  6 INJURY              3524
##  7 ALTMEN              3140
##  8 EDP                 2056
##  9 ARREST              1548
## 10 INJMAJ              1392
# Final-reason breakdown for calls that started as EDP.
clean_data %>%
  filter(initial_call_reason == "EDP") %>%
  group_by(final_call_reason) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 87 × 2
## # Groups:   final_call_reason [87]
##    final_call_reason      n
##    <chr>              <int>
##  1 EDP               100731
##  2 EDPC               36017
##  3 EDPM               11844
##  4 DRUG                7614
##  5 SICK                1687
##  6 INJURY              1469
##  7 UNKNOW              1359
##  8 ALTMEN               934
##  9 MCI43                433
## 10 UNC                  372
## # ℹ 77 more rows
# Plot the most frequent *different* final call reasons for incidents that
# started with the given initial reason.
#
# @param initial_reason Character scalar: an initial_call_reason value.
# @return A ggplot bar chart of the top reclassification targets.
get_final_reason_data <- function(initial_reason) {
  final_reason_data <- clean_data %>%
    # Exclude unchanged calls BEFORE ranking: the original code took head(10)
    # first and then dropped the matching reason, which returned at most 9
    # bars and could omit reasons ranked just below 10th. We keep the top 9
    # here, matching the original's effective bar count and the 9-color limit
    # of the Set1 brewer palette used below.
    filter(initial_call_reason == initial_reason,
           final_call_reason != initial_reason) %>%
    count(final_call_reason) %>%
    arrange(desc(n)) %>%
    head(9)

  ggplot(final_reason_data, aes(x = reorder(final_call_reason, -n), y = n, fill = final_call_reason)) +
    geom_bar(stat = "identity") +
    scale_fill_brewer(palette = "Set1") +
    theme_minimal() +
    labs(title = paste0(initial_reason," Calls by Final Reason"),
         x = "Final Call Reason",
         y = "Count")
}
# Generate the reclassification plots for three high-churn initial reasons.
get_final_reason_data("EDP")

get_final_reason_data("UNKNOW")

get_final_reason_data("INJURY")

UseCase-5

#For this use case, zipcodes were identified according to the highest incident count and also according to the highest response times. This allows us to find the hotspot zones by incidents and by response times. The zones are then plotted on the Google map using a Google Cloud API key.

UseCase - 6

#In this use case, we are generating a pie chart to show the distribution of incident_disposition_code_meaning in the clean_data dataset. The pie chart is a useful way to visualize the relative frequencies of different incident_disposition_code_meanings in the dataset and also supports our finding.

# Frequency and percentage share of each disposition outcome.
incident_disposition_count <- table(clean_data$incident_disposition_code_meaning)
incident_disposition_percent <- round(100 * incident_disposition_count / sum(incident_disposition_count), 2)

# Interactive pie chart; each slice is labelled with its name and percentage.
plot_ly(
  labels = names(incident_disposition_count),
  values = incident_disposition_count,
  type = "pie",
  textinfo = "label+percent",
  text = paste(names(incident_disposition_count), "(", incident_disposition_percent, "%)")
)